helper function
loadRData <- function(fileName){
#loads an RData file, and returns it
load(fileName)
get(ls()[ls() != "fileName"])
}
The following is the same same data set as in hQTL analysis as in https://rss.onlinelibrary.wiley.com/doi/10.1111/rssb.12411?af=R, Section 6: APPLICATION EXAMPLE: BIOLOGICAL HIGH-THROUGHPUT DATA. It is taken from Grubert et al..
chr1_full_data_grubert <- loadRData(here("data/hqtl_chrom1_chrom2/snps1.filtered.allGT.txt.rda"))
Sample names (The population sample is 76 individuals).
MatrixEQTL::colnames(chr1_full_data_grubert)[1:10]
## [1] "NA19160" "NA18861" "NA18862" "NA18909" "NA18907" "NA18517" "NA19119"
## [8] "NA19116" "NA19114" "NA19171"
SNPs
MatrixEQTL::rownames(chr1_full_data_grubert)[1:10]
## [1] "rs192328835" "rs74970982" "rs56992750" "rs186063952" "rs143773730"
## [6] "rs143777184" "rs147215883" "rs185237834" "rs189906733" "rs191297051"
top-left submatrix
chr1_full_data_grubert[[1]][1:10,1:10]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 0 0 0 0 0 0 0 0 0 0
## [2,] 2 1 1 2 1 2 0 2 1 1
## [3,] 0 1 0 1 0 0 0 1 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 1 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0
## [7,] 0 1 0 0 0 0 0 0 0 0
## [8,] 0 1 1 0 0 0 0 0 0 0
## [9,] 0 0 0 0 0 0 0 0 1 1
## [10,] 0 1 1 0 0 0 0 0 0 0
0 means no alternative allele, 1 means 1
alternative allele, 2 means 2 alternative allele,
The p-values are pre-calculated with MatrixEQTL package
ref.
It is basically row-wise t-test with covariate adjustment. Here
is the code. The pre-calculated file was. downloaded from here.
chr1_pvalues_grubert <- readRDS(here("data/hqtl_chrom1_chrom2/chr1_subset.Rds"))
head(chr1_pvalues_grubert)
## # A tibble: 6 × 6
## SNP gene beta tstat pvalue FDR
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 rs115505656 13779 2.37 21.3 2.80e-33 4.40e-24
## 2 rs75919090 13779 2.37 21.3 2.80e-33 4.40e-24
## 3 rs79217730 13779 2.37 21.3 2.80e-33 4.40e-24
## 4 rs78506386 13779 2.37 21.3 2.80e-33 4.40e-24
## 5 rs114225097 13779 2.37 21.3 2.80e-33 4.40e-24
## 6 rs2075985 29704 2.72 21.1 4.50e-33 5.90e-24
SNP,gene are primary keys.
chr1_pvalues_grubert %>%
group_by(SNP,gene) %>%
filter(n()>1)
## # A tibble: 0 × 6
## # Groups: SNP, gene [0]
## # … with 6 variables: SNP <chr>, gene <int>, beta <dbl>, tstat <dbl>,
## # pvalue <dbl>, FDR <dbl>
We consider a second data set from (Boca and Leek)[https://github.com/SiminaB/Fdr-regression/blob/master/BMI%20GIANT%20meta-analysis/BMI_GIANT_GWAS_results_Scott_theoretical.RData].
pvalues_maf_boca <- loadRData(here("data/BMI_GIANT_GWAS.RData"))
head(pvalues_maf_boca)
## # A tibble: 6 × 9
## SNP A1 A2 Freq_MAF_Hapmap b se p N
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 rs1000000 G A 0.367 -0.0001 0.0043 0.981 233572
## 2 rs10000010 T C 0.425 0.0022 0.0029 0.438 339148
## 3 rs10000012 G C 0.192 -0.0096 0.0053 0.0701 236095
## 4 rs10000013 A C 0.167 0.0096 0.0043 0.0256 236048
## 5 rs10000017 C T 0.233 0.0038 0.0045 0.398 235308
## 6 rs10000023 G T 0.408 0.0023 0.0037 0.534 236022
## # … with 1 more variable: Freq_MAF_Int_Hapmap <fct>
TODO why are there multiple rows per SNP?
pvalues_maf_boca %>%
group_by(SNP) %>%
filter(n()>1)
We calculate the standard deviation for the beta estimate of the
effect size from the beta_sefrom beta and
tstat.
chr1_pvalues_grubert <- chr1_pvalues_grubert %>%
mutate(beta_se = beta/tstat,
tstat = NULL,
FDR = NULL)
We harmonise the two data sets.
pvalues_maf_boca <- pvalues_maf_boca %>%
dplyr::rename(beta = b, beta_se = se, pvalue = p, maf_sample = Freq_MAF_Hapmap) %>%
mutate(Freq_MAF_Int_Hapmap = NULL,
A1 = NULL,
A2 = NULL)
colnames(chr1_pvalues_grubert)
## [1] "SNP" "gene" "beta" "pvalue" "beta_se"
colnames(pvalues_maf_boca)
## [1] "SNP" "maf_sample" "beta" "beta_se" "pvalue"
## [6] "N"
TODO how is Freq_MAF_Hapmap calculated? Is it from sample?
ensembl <- biomaRt::useMart("ENSEMBL_MART_SNP", dataset = "hsapiens_snp")
Download minor_allele_freq for first data set
chr1_maf_grubert_downloaded <- biomaRt::getBM(
attributes = c("refsnp_id", "minor_allele_freq"),
filters = "snp_filter", values = "rs1620325",#unique(chr1_pvalues_grubert$SNP),
mart = ensembl, uniqueRows = TRUE
)
writeRDS(chr1_maf_grubert_downloaded, here("data/downloaded_covariates/chr1_maf.Rds"))
Load the pre-downloaded maf
chr1_maf_grubert_downloaded <- readRDS(here("data/downloaded_covariates/chr1_maf.Rds"))
join, filter out rows where downloaded maf is missing
chr1_maf_grubert_downloaded <- chr1_maf_grubert_downloaded %>%
dplyr::rename(maf_downloaded = minor_allele_freq)
chr1_pvalues_grubert <- chr1_pvalues_grubert %>%
inner_join(chr1_maf_grubert_downloaded, by = c("SNP" = "refsnp_id")) %>%
drop_na(maf_downloaded)
ggplot(chr1_pvalues_grubert, aes(x = maf_downloaded)) +
geom_histogram(fill = "lightblue", color = "black", bins = 30) +
labs(title = "Histogram of MAF Download",
x = "MAF Download",
y = "Frequency")
range(chr1_pvalues_grubert$maf_downloaded)
## [1] 0.0 0.5
chr1_maf <- biomaRt::getBM(
attributes = c("refsnp_id", "minor_allele_freq"),
filters = "snp_filter", values = unique(pvalues_maf_boca$SNP),
mart = ensembl, uniqueRows = TRUE
)
This code is directly taken from here: http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/faq.html.
Explenation: each entry of the matrix can take the values
0,1,2.
rowMeans(slice,na.rm=TRUE)/2=rowSum(slice,na.rm=TRUE)/(2*76).
maf.list = vector('list', length(chr1_full_data_grubert))
for(sl in seq_along(chr1_full_data_grubert)) {
slice = chr1_full_data_grubert[[sl]];
maf.list[[sl]] = rowMeans(slice,na.rm=TRUE)/2;
maf.list[[sl]] = pmin(maf.list[[sl]],1-maf.list[[sl]]);
}
We assign the maf to the snp identifier TODO mistake in this cell?
chr1_maf_sample <- data.frame(
SNP = MatrixEQTL::rownames(chr1_full_data_grubert),
maf_sample = unlist(maf.list)
)
chr1_pvalues_grubert <- chr1_pvalues_grubert %>%
inner_join(chr1_maf_sample, by = "SNP")
ggplot(chr1_pvalues_grubert, aes(x = maf_sample)) +
geom_histogram(fill = "lightblue", color = "black", bins = 30) +
labs(title = "Histogram of MAF Sample",
x = "MAF Sample",
y = "Frequency")
This is plausible and checks out with what we would expect from the datq
set description.
range(chr1_pvalues_grubert$maf_sample)
## [1] 0.05263158 0.50000000
already pre calculated
ggplot(pvalues_maf_boca, aes(x = maf_sample)) +
geom_histogram(fill = "lightblue", color = "black", bins = 30) +
labs(title = "Histogram of MAF Sample",
x = "MAF Sample",
y = "Frequency")
The boca leek data set seems to be not filtered for significant
SNPs.
range(pvalues_maf_boca$maf_sample)
## [1] 0.0 0.5
The sample MAF does not go below 0.05.
ggplot(chr1_pvalues_grubert,
aes(x = maf_sample, y = maf_downloaded)) +
scale_fill_gradient(name = "count", trans = "log10")+
geom_hex(bins = 60) +
geom_abline(intercept=0, slope=1, color="red") +
labs(x = "MAF Sample",
y = "MAF Downloaded")
ggplot(chr1_pvalues_grubert,
aes(x = rank(maf_sample), y = rank(maf_downloaded))) +
scale_fill_gradient(name = "count", trans = "log10")+
geom_hex(bins = 60) +
geom_abline(intercept=0, slope=1, color="red") +
labs(x = "MAF Sample Rank",
y = "MAF Downloaded Rank")
# compare minor alle frequency and standard deviation of effect size
estimate
The beta_se estimate is calculated from the pre-calculated p-value. So it is assumed, that the pre-calculated p-value is correct. This is plausible: std should decrease with smaller beta_se.
ggplot(chr1_pvalues_grubert,
aes(x = maf_sample, y = beta_se)) +
scale_y_log10()+
scale_fill_gradient(name = "count", trans = "log10")+
geom_hex() #bins = 60
ggplot(chr1_pvalues_grubert,
aes(x = maf_downloaded, y = beta_se)) +
scale_y_log10()+
scale_fill_gradient(name = "count", trans = "log10")+
geom_hex() #bins = 60
So the sample maf looks more informative
The beta_se estimate is calculated from the pre-calculated p-value. So it is assumed, that the pre-calculated p-value is correct.
ggplot(pvalues_maf_boca,
aes(x = maf_sample, y = beta_se)) +
scale_y_log10()+
scale_fill_gradient(name = "count", trans = "log10")+
geom_hex() #bins = 60
The following plot makes sense. In chr1_pvalues_grubert
we have filtered out SNPs with low MAF and high pvalues.
ggplot(chr1_pvalues_grubert,
aes(x = beta_se , y = pvalue)) +
scale_y_log10()+
scale_fill_gradient(name = "count", trans = "log10")+
geom_hex() #bins = 60
TODO why is beta_se independent of
pvalue.
ggplot(pvalues_maf_boca,
aes(x = beta_se, y = pvalue)) +
scale_y_log10()+
scale_fill_gradient(name = "count", trans = "log10")+
geom_hex() #bins = 60
Reproducing what happens for data set 1.
ggplot(pvalues_maf_boca %>%
filter(pvalue <= 10^(-2), maf_sample >= 0.05),
aes(x = beta_se, y = pvalue)) +
scale_y_log10()+
scale_fill_gradient(name = "count", trans = "log10")+
geom_hex() #bins = 60
maf_sample does not seem super informative.
ggplot(chr1_pvalues_grubert,
aes(x = maf_sample, y = pvalue)) +
scale_y_log10()+
scale_fill_gradient(name = "count", trans = "log10")+
geom_hex() #bins = 60
Here the data set 2 exhibits a much stronger trend.
maf_sample seems more informative.
ggplot(pvalues_maf_boca,
aes(x = maf_sample, y = pvalue)) +
scale_y_log10()+
scale_fill_gradient(name = "count", trans = "log10")+
geom_hex() #bins = 60
# stratifying pvalues by beta std
groups_by_filter <- function(covariate, nbins, ties.method="random", seed=NULL){
if (!is.null(seed) && ties.method=="random"){
#http://stackoverflow.com/questions/14324096/setting-seed-locally-not-globally-in-r?rq=1
tmp <- runif(1)
old <- .Random.seed
on.exit( { .Random.seed <<- old } )
set.seed(as.integer(seed))
}
rfs <- rank(covariate, ties.method=ties.method)/length(covariate)
as.factor(ceiling( rfs* nbins))
}
mutate(beta_se = beta/tstat)
ggplot(chr1_pvalues_grubert,
aes(x = pvalue)) +
geom_histogram(boundary = 0) +
facet_wrap(groups_by_filter(chr1_pvalues_grubert$beta_se, 8), nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## data set 2
ggplot(pvalues_maf_boca,
aes(x = pvalue)) +
geom_histogram(boundary = 0) +
facet_wrap(groups_by_filter(pvalues_maf_boca$beta_se, 8), nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(chr1_pvalues_grubert,
aes(x = pvalue)) +
geom_histogram(boundary = 0) +
facet_wrap(groups_by_filter(chr1_pvalues_grubert$maf_sample, 8), nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(pvalues_maf_boca,
aes(x = pvalue)) +
geom_histogram(boundary = 0) +
facet_wrap(groups_by_filter(pvalues_maf_boca$maf_sample, 8), nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.